# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import plotly.express as px
from geopy.distance import lonlat, distance
Load in your dataset and describe its properties through the questions below. Try and motivate your exploration goals through this section.
# Read the trip records from the CSV export and preview the first five rows
df = pd.read_csv(filepath_or_buffer='201902-fordgobike-tripdata.csv')
df.head(n=5)
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.1450 | 2019-03-01 08:01:55.9750 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984.0 | Male | No |
| 1 | 42521 | 2019-02-28 18:53:21.7890 | 2019-03-01 06:42:03.0560 | 23.0 | The Embarcadero at Steuart St | 37.791464 | -122.391034 | 81.0 | Berry St at 4th St | 37.775880 | -122.393170 | 2535 | Customer | NaN | NaN | No |
| 2 | 61854 | 2019-02-28 12:13:13.2180 | 2019-03-01 05:24:08.1460 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | -122.404904 | 5905 | Customer | 1972.0 | Male | No |
| 3 | 36490 | 2019-02-28 17:54:26.0100 | 2019-03-01 04:02:36.8420 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6638 | Subscriber | 1989.0 | Other | No |
| 4 | 1585 | 2019-02-28 23:54:18.5490 | 2019-03-01 00:20:44.0740 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | -122.248780 | 4898 | Subscriber | 1974.0 | Male | Yes |
# Column names, non-null counts and dtypes of the raw data set.
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only adds a stray "None" line to the cell output.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 183412 entries, 0 to 183411 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 183412 non-null int64 1 start_time 183412 non-null object 2 end_time 183412 non-null object 3 start_station_id 183215 non-null float64 4 start_station_name 183215 non-null object 5 start_station_latitude 183412 non-null float64 6 start_station_longitude 183412 non-null float64 7 end_station_id 183215 non-null float64 8 end_station_name 183215 non-null object 9 end_station_latitude 183412 non-null float64 10 end_station_longitude 183412 non-null float64 11 bike_id 183412 non-null int64 12 user_type 183412 non-null object 13 member_birth_year 175147 non-null float64 14 member_gender 175147 non-null object 15 bike_share_for_all_trip 183412 non-null object dtypes: float64(7), int64(2), object(7) memory usage: 22.4+ MB
None
# Parse the start_time and end_time text columns into datetime64 values
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])
# Turn bike_share_for_all_trip into a boolean: True where the text value is 'Yes'
df['bike_share_for_all_trip'] = df['bike_share_for_all_trip'].eq('Yes')
# Count the missing (NaN) entries in every column
display(df.isna().sum())
duration_sec 0 start_time 0 end_time 0 start_station_id 197 start_station_name 197 start_station_latitude 0 start_station_longitude 0 end_station_id 197 end_station_name 197 end_station_latitude 0 end_station_longitude 0 bike_id 0 user_type 0 member_birth_year 8265 member_gender 8265 bike_share_for_all_trip 0 dtype: int64
# Count rows that are exact repeats of an earlier row (first occurrence is kept as the original)
display(df.duplicated(keep='first').sum())
0
# Drop every row that has at least one missing value and keep the result in a new
# data frame. The explicit .copy() makes df_clean an independent frame, so adding
# columns to it later does not raise SettingWithCopyWarning.
df_clean = df.dropna().copy()
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in display() (which would also show "None").
df_clean.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 174952 entries, 0 to 183411 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 174952 non-null int64 1 start_time 174952 non-null datetime64[ns] 2 end_time 174952 non-null datetime64[ns] 3 start_station_id 174952 non-null float64 4 start_station_name 174952 non-null object 5 start_station_latitude 174952 non-null float64 6 start_station_longitude 174952 non-null float64 7 end_station_id 174952 non-null float64 8 end_station_name 174952 non-null object 9 end_station_latitude 174952 non-null float64 10 end_station_longitude 174952 non-null float64 11 bike_id 174952 non-null int64 12 user_type 174952 non-null object 13 member_birth_year 174952 non-null float64 14 member_gender 174952 non-null object 15 bike_share_for_all_trip 174952 non-null bool dtypes: bool(1), datetime64[ns](2), float64(7), int64(2), object(4) memory usage: 21.5+ MB
None
# Preview the first five rows of the cleaned data
df_clean.head(n=5)
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | end_station_longitude | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | -122.402923 | 4902 | Customer | 1984.0 | Male | False |
| 2 | 61854 | 2019-02-28 12:13:13.218 | 2019-03-01 05:24:08.146 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | -122.404904 | 5905 | Customer | 1972.0 | Male | False |
| 3 | 36490 | 2019-02-28 17:54:26.010 | 2019-03-01 04:02:36.842 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | -122.444293 | 6638 | Subscriber | 1989.0 | Other | False |
| 4 | 1585 | 2019-02-28 23:54:18.549 | 2019-03-01 00:20:44.074 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | -122.248780 | 4898 | Subscriber | 1974.0 | Male | True |
| 5 | 1793 | 2019-02-28 23:49:58.632 | 2019-03-01 00:19:51.760 | 93.0 | 4th St at Mission Bay Blvd S | 37.770407 | -122.391198 | 323.0 | Broadway at Kearny | 37.798014 | -122.405950 | 5200 | Subscriber | 1959.0 | Male | False |
After cleaning the null values, the dataset contains 174,952 rows/entries and 16 columns/features: ('duration_sec', 'start_time', 'end_time', 'start_station_id', 'start_station_name', 'start_station_latitude', 'start_station_longitude', 'end_station_id', 'end_station_name', 'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type', 'member_birth_year', 'member_gender', 'bike_share_for_all_trip').
2 features in datetime format: ('start_time', 'end_time')
1 boolean format: ('bike_share_for_all_trip')
4 string/object format: ('start_station_name', 'end_station_name', 'user_type', 'member_gender' )
9 numerical format: ('duration_sec', 'start_station_id', 'start_station_latitude', 'start_station_longitude', 'end_station_id', 'end_station_latitude', 'end_station_longitude', 'bike_id', 'member_birth_year').
I am interested in how long the average trip takes and in which factors most affect the trip duration.
User type, gender, age could be very helpful, as well as the start and end stations.
#Calculating trips' distances in meters from the start and end stations coordinates
def to_distance(df_name, start_lat_col, start_long_col, end_lat_col, end_long_col):
    """Return the start-to-end distance of every row, in meters.

    Parameters
    ----------
    df_name : mapping of column name -> sequence (e.g. a pandas DataFrame)
        Table holding the coordinate columns.
    start_lat_col, start_long_col : str
        Names of the columns with the start latitude and longitude.
    end_lat_col, end_long_col : str
        Names of the columns with the end latitude and longitude.

    Returns
    -------
    list of float
        One distance per row, in row order, taken from the ``.m`` (meters)
        attribute of geopy's ``distance``. An empty table gives ``[]``.
    """
    # Walk the four coordinate columns in lockstep instead of indexing by position.
    coordinates = zip(
        df_name[start_lat_col],
        df_name[start_long_col],
        df_name[end_lat_col],
        df_name[end_long_col],
    )
    # Coordinates are rounded to 6 decimal places before measuring.
    # lonlat() takes (longitude, latitude), the reverse of geopy's usual order.
    return [
        distance(
            lonlat(round(s_long, 6), round(s_lat, 6)),
            lonlat(round(e_long, 6), round(e_lat, 6)),
        ).m
        for s_lat, s_long, e_lat, e_long in coordinates
    ]
# Add the derived columns duration_mins, start_hour, end_hour, distance_meters and member_age.
# assign() returns a new data frame with the extra columns instead of writing into
# the dropna() result column by column, which is what raised SettingWithCopyWarning.
df_clean = df_clean.assign(
    # Trip duration converted from seconds to minutes
    duration_mins=df_clean['duration_sec'] / 60,
    # Hour of day (0-23) at which the trip started / ended
    start_hour=df_clean['start_time'].dt.hour,
    end_hour=df_clean['end_time'].dt.hour,
    # Distance in meters between the start and end station coordinates
    distance_meters=to_distance(
        df_clean,
        'start_station_latitude',
        'start_station_longitude',
        'end_station_latitude',
        'end_station_longitude',
    ),
    # Age in years, taking 2019 as the reference year
    member_age=2019 - df_clean['member_birth_year'],
)
<ipython-input-10-69ebcb506cc7>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_clean['duration_mins'] = df_clean.duration_sec/60 <ipython-input-10-69ebcb506cc7>:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_clean['start_hour'] = df_clean['start_time'].dt.hour <ipython-input-10-69ebcb506cc7>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_clean['end_hour'] = df_clean['end_time'].dt.hour <ipython-input-10-69ebcb506cc7>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_clean['distance_meters'] = to_distance(df_clean, 'start_station_latitude', 'start_station_longitude', 'end_station_latitude', 'end_station_longitude') <ipython-input-10-69ebcb506cc7>:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_clean['member_age'] = 2019 - df_clean['member_birth_year']
# Show the column summary again, now including the newly added columns.
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only adds a stray "None" line to the cell output.
df_clean.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 174952 entries, 0 to 183411 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 duration_sec 174952 non-null int64 1 start_time 174952 non-null datetime64[ns] 2 end_time 174952 non-null datetime64[ns] 3 start_station_id 174952 non-null float64 4 start_station_name 174952 non-null object 5 start_station_latitude 174952 non-null float64 6 start_station_longitude 174952 non-null float64 7 end_station_id 174952 non-null float64 8 end_station_name 174952 non-null object 9 end_station_latitude 174952 non-null float64 10 end_station_longitude 174952 non-null float64 11 bike_id 174952 non-null int64 12 user_type 174952 non-null object 13 member_birth_year 174952 non-null float64 14 member_gender 174952 non-null object 15 bike_share_for_all_trip 174952 non-null bool 16 duration_mins 174952 non-null float64 17 start_hour 174952 non-null int64 18 end_hour 174952 non-null int64 19 distance_meters 174952 non-null float64 20 member_age 174952 non-null float64 dtypes: bool(1), datetime64[ns](2), float64(10), int64(4), object(4) memory usage: 28.2+ MB
None
# Descriptive statistics of the data set (count, mean, std, min, quartiles, max)
display(df_clean.describe(percentiles=[0.25, 0.5, 0.75]))
| duration_sec | start_station_id | start_station_latitude | start_station_longitude | end_station_id | end_station_latitude | end_station_longitude | bike_id | member_birth_year | duration_mins | start_hour | end_hour | distance_meters | member_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 | 174952.000000 |
| mean | 704.002744 | 139.002126 | 37.771220 | -122.351760 | 136.604486 | 37.771414 | -122.351335 | 4482.587555 | 1984.803135 | 11.733379 | 13.456165 | 13.609533 | 1689.941046 | 34.196865 |
| std | 1642.204905 | 111.648819 | 0.100391 | 0.117732 | 111.335635 | 0.100295 | 0.117294 | 1659.195937 | 10.118731 | 27.370082 | 4.734282 | 4.748029 | 1096.527175 | 10.118731 |
| min | 61.000000 | 3.000000 | 37.317298 | -122.453704 | 3.000000 | 37.317298 | -122.453704 | 11.000000 | 1878.000000 | 1.016667 | 0.000000 | 0.000000 | 0.000000 | 18.000000 |
| 25% | 323.000000 | 47.000000 | 37.770407 | -122.411901 | 44.000000 | 37.770407 | -122.411647 | 3799.000000 | 1980.000000 | 5.383333 | 9.000000 | 9.000000 | 909.773856 | 27.000000 |
| 50% | 510.000000 | 104.000000 | 37.780760 | -122.398279 | 101.000000 | 37.781010 | -122.397437 | 4960.000000 | 1987.000000 | 8.500000 | 14.000000 | 14.000000 | 1428.179296 | 32.000000 |
| 75% | 789.000000 | 239.000000 | 37.797320 | -122.283093 | 238.000000 | 37.797673 | -122.286533 | 5505.000000 | 1992.000000 | 13.150000 | 17.000000 | 18.000000 | 2224.683656 | 39.000000 |
| max | 84548.000000 | 398.000000 | 37.880222 | -121.874119 | 398.000000 | 37.880222 | -121.874119 | 6645.000000 | 2001.000000 | 1409.133333 | 23.000000 | 23.000000 | 69465.935316 | 141.000000 |
# Preview the first five rows, including the derived columns
df_clean.head(n=5)
| duration_sec | start_time | end_time | start_station_id | start_station_name | start_station_latitude | start_station_longitude | end_station_id | end_station_name | end_station_latitude | ... | bike_id | user_type | member_birth_year | member_gender | bike_share_for_all_trip | duration_mins | start_hour | end_hour | distance_meters | member_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52185 | 2019-02-28 17:32:10.145 | 2019-03-01 08:01:55.975 | 21.0 | Montgomery St BART Station (Market St at 2nd St) | 37.789625 | -122.400811 | 13.0 | Commercial St at Montgomery St | 37.794231 | ... | 4902 | Customer | 1984.0 | Male | False | 869.750000 | 17 | 8 | 544.024930 | 35.0 |
| 2 | 61854 | 2019-02-28 12:13:13.218 | 2019-03-01 05:24:08.146 | 86.0 | Market St at Dolores St | 37.769305 | -122.426826 | 3.0 | Powell St BART Station (Market St at 4th St) | 37.786375 | ... | 5905 | Customer | 1972.0 | Male | False | 1030.900000 | 12 | 5 | 2705.440538 | 47.0 |
| 3 | 36490 | 2019-02-28 17:54:26.010 | 2019-03-01 04:02:36.842 | 375.0 | Grove St at Masonic Ave | 37.774836 | -122.446546 | 70.0 | Central Ave at Fell St | 37.773311 | ... | 6638 | Subscriber | 1989.0 | Other | False | 608.166667 | 17 | 4 | 260.862242 | 30.0 |
| 4 | 1585 | 2019-02-28 23:54:18.549 | 2019-03-01 00:20:44.074 | 7.0 | Frank H Ogawa Plaza | 37.804562 | -122.271738 | 222.0 | 10th Ave at E 15th St | 37.792714 | ... | 4898 | Subscriber | 1974.0 | Male | True | 26.416667 | 23 | 0 | 2411.980625 | 45.0 |
| 5 | 1793 | 2019-02-28 23:49:58.632 | 2019-03-01 00:19:51.760 | 93.0 | 4th St at Mission Bay Blvd S | 37.770407 | -122.391198 | 323.0 | Broadway at Kearny | 37.798014 | ... | 5200 | Subscriber | 1959.0 | Male | False | 29.883333 | 23 | 0 | 3328.332818 | 60.0 |
5 rows × 21 columns
%> In this section, investigate distributions of individual variables. If you see unusual points or outliers, take a deeper look to clean things up and prepare yourself to look at relationships between variables.
%> Make sure that, after every plot or related series of plots, you include a Markdown cell with comments about what you observed and what you plan on investigating next.
Distribution of the trips' durations
# Histogram of trip durations in seconds, with the x axis limited to [-500, 10000]
# and a logarithmic count axis. The title spelling ("Historgam") is corrected.
px.histogram(
    df_clean,
    x='duration_sec',
    range_x=[-500, 10000],
    log_y=True,
    template='plotly_dark',
    title='Histogram of Trip Durations',
    labels={'duration_sec': 'Duration (Seconds)', 'count': 'Trips Count'},
)
The plot shows that the duration values are condensed between 0 and 4000 seconds, with a peak at around 350 seconds.
Distribution of the member age
# Histogram of member ages, with the x axis limited to 10-100 years.
# The title spelling ("Historgam") is corrected.
px.histogram(
    df_clean,
    x='member_age',
    range_x=[10, 100],
    template='plotly_dark',
    title='Histogram members age',
    labels={'member_age': 'Age (Years)'},
)
The plot shows that the age values are condensed between 20 and 40 years.
Distribution of the gender
# Count of trips per member gender. The title spelling ("Historgam") is corrected.
px.histogram(
    df_clean,
    x='member_gender',
    template='plotly_dark',
    title='Histogram members gender',
    labels={'member_gender': 'Genders'},
)
The plot shows that the majority of the members are males.
Distribution of the customer type
# Count of trips per user type. The title spelling ("Historgam") is corrected.
px.histogram(
    df_clean,
    x='user_type',
    template='plotly_dark',
    title='Histogram User Type',
    labels={'user_type': 'User Type'},
)
The plot shows that the majority of the customers are subscribers.
Distribution of the trips' distance
# Histogram of trip distances in meters, with the x axis limited to [-500, 10000].
# The title spelling ("Histo0rgam") is corrected.
px.histogram(
    df_clean,
    x='distance_meters',
    range_x=[-500, 10000],
    template='plotly_dark',
    title='Trips distances Histogram',
    labels={'distance_meters': 'Distance (Meters)'},
)
The plot shows that the distance values are condensed between 200 and 2000 meters.
Distribution of start/end hours
# Count of trips per start hour of the day. The title spelling ("Historgam") is corrected.
px.histogram(
    df_clean,
    x='start_hour',
    template='plotly_dark',
    title='Histogram Start Hour',
    labels={'start_hour': 'Hour of Day'},
)
# Count of trips per end hour of the day. The title spelling ("Historgam") is corrected.
px.histogram(
    df_clean,
    x='end_hour',
    template='plotly_dark',
    title='Histogram End Hour',
    labels={'end_hour': 'Hour of Day'},
)
The plots show that most trips start and end at around 8 AM and 5 PM (the start and end of the working day).
The trip duration values are condensed between 0 and 4000 seconds, with a peak at around 350 seconds. The original histogram was highly skewed to the right, so a log scale was used to fix this.
The distributions of the start hour and end hour were very close to each other: both have peaks at 8 AM and 5 PM, which are the start and end hours of the working day. The age histogram showed peaks at 26 and 31 years; the age was calculated by subtracting the member birth year from the year of the dataset collection (2019). Finally, the distance distribution showed that the values are concentrated between 200 and 2000 meters; the distance of each trip was calculated from the latitude and longitude coordinates of the start and end stations.
%> In this section, investigate relationships between pairs of variables in your data. Make sure the variables that you cover here have been introduced in some fashion in the previous section (univariate exploration).
Duration vs Age
# Scatter plot of trip duration (seconds) against member age,
# with the axes limited to ages 10-100 and durations -500 to 40000.
px.scatter(
    df_clean,
    x='member_age',
    y='duration_sec',
    title='Duration vs Age',
    labels={'member_age': 'Age (Years)', 'duration_sec': 'Duration (Seconds)'},
    range_x=[10, 100],
    range_y=[-500, 40000],
    template='plotly_dark',
)
The plot shows that members aged between 20 and 50 have higher trip durations than older members.
Duration vs Distance
# Scatter plot of trip duration (seconds) against trip distance (meters),
# with both axes limited to [-500, 10000].
# The title previously read 'Duration vs Age', copied from the preceding plot.
px.scatter(
    df_clean,
    x='distance_meters',
    y='duration_sec',
    range_x=[-500, 10000],
    range_y=[-500, 10000],
    template='plotly_dark',
    title='Duration vs Distance',
    labels={'distance_meters': 'Distance (Meters)', 'duration_sec': 'Duration (Seconds)'},
)
The above scatter plot shows that the duration has a high relation with distance.
Duration vs Gender
# Box plot of trip duration (seconds) per member gender,
# with the duration axis limited to [-300, 3000].
px.box(
    df_clean,
    x='member_gender',
    y='duration_sec',
    title='Duration vs Gender',
    labels={'member_gender': 'Gender', 'duration_sec': 'Duration (Seconds)'},
    range_y=[-300, 3000],
    template='plotly_dark',
)
The duration vs gender box plot shows that all the genders affect the duration in the same manner.
Duration vs User Type
# Box plot of trip duration (seconds) per user type,
# with the duration axis limited to [-300, 3000].
px.box(
    df_clean,
    x='user_type',
    y='duration_sec',
    title='Duration vs User Type',
    labels={'user_type': 'User Type', 'duration_sec': 'Duration (Seconds)'},
    range_y=[-300, 3000],
    template='plotly_dark',
)
The duration vs type box plot shows that the duration of the trip is longer for customers than for subscribers.
Trip duration is highly dependent on the age of the member: members aged between 20 and 50 have higher trip durations than older members. The duration is also strongly related to distance. Finally, we can observe that customers go on longer trips than subscribers.
The gender didn't seem to affect the duration of the trips that much.
%> Create plots of three or more variables to investigate your data even further. Make sure that your investigations are justified, and follow from your work in the previous sections.
Duration, Age and Gender
# Scatter plot of trip duration (seconds) against member age, colored by gender,
# with the axes limited to ages 10-100 and durations -500 to 40000.
px.scatter(
    df_clean,
    x='member_age',
    y='duration_sec',
    color='member_gender',
    title='Duration vs Age vs Gender',
    labels={'member_age': 'Age (Years)', 'duration_sec': 'Duration (Seconds)', 'member_gender': 'Gender'},
    range_x=[10, 100],
    range_y=[-500, 40000],
    template='plotly_dark',
)
The duration, age, gender scatter plot shows that the females between the age of 20 and 50 took more and longer trips than the other genders.
Duration, Age and User Type
# Scatter plot of trip duration (seconds) against member age, colored by user type,
# with the axes limited to ages 10-100 and durations -500 to 40000.
px.scatter(
    df_clean,
    x='member_age',
    y='duration_sec',
    color='user_type',
    title='Duration vs Age vs User Type',
    labels={'member_age': 'Age (Years)', 'duration_sec': 'Duration (Seconds)', 'user_type': 'User Type'},
    range_x=[10, 100],
    range_y=[-500, 40000],
    template='plotly_dark',
)
The above plot shows that, for older members, the trip duration is higher for subscribers than for customers.
From the previous plots, we know that the majority of users are male, but now we know that females and others affect the trips' durations more.
For older members, the trip duration is higher for subscribers than for customers.